# Project setup: set the working directory, attach packages, load inputs.
# NOTE(review): the hard-coded path ties the script to one directory layout;
# every relative file name passed to load() in this script resolves against it.
setwd("~/PRQ_contribution_access")
library(tidyverse)
library(randomForest)
# Each load() restores the objects saved in the named file into the workspace.
# Presumably these files supply `data_for_ML`, `registrant_year_metadata`,
# `legislator_registrant_year_metadata` and
# `legislator_registrant_year_metadata_with_ideology`, all of which are used
# further down — TODO confirm against the saved files.
load("data_for_ML.RData")
load("registrant_year_metadata.RData")
load("legislator_registrant_year_metadata.RData")
load("legislator_registrant_year_metadata_with_ideology.RData")

#### Split the document-term matrix into training, validation and uncoded sets. ####
# Rows 1-1500 are used to fit the models, rows 1501-2000 to validate them, and
# every remaining row is classified by the fitted models later on.
# Stop early if the first 2000 rows are not all present, rather than slicing
# past the end of the data.
stopifnot(nrow(data_for_ML) >= 2000)
dtrain.ML <- data_for_ML[1:1500, ]
dtest.ML <- data_for_ML[1501:2000, ]
# Pick the tail with a logical mask instead of `2001:nrow(data_for_ML)`: with
# exactly 2000 rows that sequence counts down (2001, 2000) instead of being
# empty, so it would index past the end of the data and re-select row 2000.
d_uncoded <- data_for_ML[seq_len(nrow(data_for_ML)) > 2000, ]

# Seed the RNG once, ahead of both model fits below.
# NOTE(review): anything between here and the second fit that draws random
# numbers (possibly tie-breaking inside predict() — confirm) influences that
# fit, so the number and order of calls in this section should stay fixed.
set.seed(19920525)

#### Fit random forest models and validate them on the held-out rows. ####
# Model for whether access was requested. The two id columns, `contribution`
# and the other outcome variable are removed so they cannot act as predictors.
RF_access_requested <- randomForest(
  dv_access_requested ~ .,
  data = select(dtrain.ML,
                -report_sample_id,
                -legislator.id,
                -contribution,
                -dv_access_obtained)
)
# Validation cross-tabulation: coded value in rows, model prediction in columns.
dtest.ML %>%
  predict(RF_access_requested, newdata = .) %>%
  table(dtest.ML$dv_access_requested, .)
# The same cross-tabulation expressed as proportions within each row.
dtest.ML %>%
  predict(RF_access_requested, newdata = .) %>%
  table(dtest.ML$dv_access_requested, .) %>%
  prop.table(margin = 1)

# Model for whether a request for access was successful or unsuccessful. The
# two id columns, `contribution` and the other outcome variable are removed so
# they cannot act as predictors.
RF_access_obtained <- randomForest(
  dv_access_obtained ~ .,
  data = select(dtrain.ML,
                -report_sample_id,
                -legislator.id,
                -contribution,
                -dv_access_requested)
)
# Validation cross-tabulation: coded value in rows, model prediction in columns.
dtest.ML %>%
  predict(RF_access_obtained, newdata = .) %>%
  table(dtest.ML$dv_access_obtained, .)
# The same cross-tabulation expressed as proportions within each row.
dtest.ML %>%
  predict(RF_access_obtained, newdata = .) %>%
  table(dtest.ML$dv_access_obtained, .) %>%
  prop.table(margin = 1)

#### Classify the uncoded records with the fitted models. ####
# Fill both outcome columns of the uncoded rows with model predictions. The
# first assignment is completed before the second predict() call runs.
d_uncoded[["dv_access_requested"]] <- predict(RF_access_requested,
                                              newdata = d_uncoded)
d_uncoded[["dv_access_obtained"]] <- predict(RF_access_obtained,
                                             newdata = d_uncoded)
# Stack training, validation and newly classified rows into a single table.
ML_classified_records <- list(dtrain.ML, dtest.ML, d_uncoded) %>%
  bind_rows()

# Load the manually corrected records (presumably the object
# `corrected_records` — confirm against the saved file) and cross-tabulate them
# against the model classifications: model output in rows, corrected values in
# columns.
# NOTE(review): the two tables are paired by row position, so this assumes
# both hold the same records in the same order — confirm.
load("corrected_records.RData")
table(ML_classified_records[["dv_access_requested"]],
      corrected_records[["dv_access_requested"]])
table(ML_classified_records[["dv_access_obtained"]],
      corrected_records[["dv_access_obtained"]])

# Collapse the corrected records to one row per report. Every record is first
# labelled with its access outcome; the legislators under each outcome, and
# those with contribution == 1, are then kept as sorted, de-duplicated
# list-columns. The outcome conditions cannot overlap, so their order in
# case_when() does not affect the result; records matching none stay NA.
records_by_report <- corrected_records %>%
  mutate(access = case_when(
    dv_access_requested == 0 ~
      "0_access_not_requested",
    dv_access_requested == 1 & dv_access_obtained == 0 ~
      "1_access_requested_unsuccessfully",
    dv_access_requested == 1 & dv_access_obtained == 1 & staff == 1 ~
      "2_access_obtained_to_staff",
    dv_access_requested == 1 & dv_access_obtained == 1 & staff == 0 ~
      "3_access_obtained_to_member"
  )) %>%
  group_by(report_sample_id) %>%
  summarise(
    legislators_access_requested =
      legislator.id[access != "0_access_not_requested"] %>%
      unique() %>% sort() %>% list(),
    legislators_access_requested_unsuccessfully =
      legislator.id[access == "1_access_requested_unsuccessfully"] %>%
      unique() %>% sort() %>% list(),
    legislators_access_obtained_to_staff =
      legislator.id[access == "2_access_obtained_to_staff"] %>%
      unique() %>% sort() %>% list(),
    legislators_access_obtained_to_member =
      legislator.id[access == "3_access_obtained_to_member"] %>%
      unique() %>% sort() %>% list(),
    legislators_contributed =
      legislator.id[contribution == 1] %>%
      unique() %>% sort() %>% list()
  ) %>%
  ungroup()

#### Merge with registrant-year level metadata. ####
# Expand the ";"-separated report ids to one row per report, attach the
# per-report legislator lists, then pool those lists within each
# registrant-year group into sorted, de-duplicated list-columns.
registrant_year_records <- registrant_year_metadata %>%
  unnest(report_sample_id = str_split(report_sample_ids, ";")) %>%
  mutate(report_sample_id = as.numeric(paste(report_sample_id))) %>%
  left_join(records_by_report, by = "report_sample_id") %>%
  group_by(registrant.id, year, number_of_clients, number_of_lobbyists) %>%
  summarise(
    legislators_access_requested =
      list(sort(unique(unlist(legislators_access_requested)))),
    legislators_access_requested_unsuccessfully =
      list(sort(unique(unlist(legislators_access_requested_unsuccessfully)))),
    legislators_access_obtained_to_staff =
      list(sort(unique(unlist(legislators_access_obtained_to_staff)))),
    legislators_access_obtained_to_member =
      list(sort(unique(unlist(legislators_access_obtained_to_member)))),
    legislators_contributed =
      list(sort(unique(unlist(legislators_contributed))))
  ) %>%
  ungroup() %>%
  # Store the registrant id as text via paste().
  mutate(registrant.id = paste(registrant.id))

#### Merge with legislator-registrant-year level metadata. ####

# Build a lookup of distinct (registrant.id, year, legislator.id) triples from
# one list-column of `records`.
#   records:  registrant-year table with list-columns of legislator ids.
#   list_col: name (string) of the list-column to expand.
#   flag_col: name (string) of the indicator column to create.
# Returns one row per distinct triple with `flag_col` set to 1. Legislator ids
# are converted to text with paste() so they can be matched against
# `legislator.id` in the metadata (presumably character there — confirm).
legislator_flag_table <- function(records, list_col, flag_col) {
  triples <- records[c("registrant.id", "year", list_col)]
  names(triples)[3] <- "legislator.id"
  triples <- triples %>%
    unnest(legislator.id) %>%
    mutate(legislator.id = paste(legislator.id)) %>%
    distinct(registrant.id, year, legislator.id)
  triples[[flag_col]] <- 1
  triples
}

# Attach the contribution indicator and the access outcome to a
# legislator-registrant-year table.
#   metadata: table keyed by registrant.id, year and legislator.id.
#   records:  registrant-year table with the legislator list-columns.
# Returns `metadata` with two added columns: `contribution` (1 when the
# legislator appears in `legislators_contributed`, otherwise 0) and `access`
# (one of the four "0_" ... "3_" outcome labels). The intermediate `access_*`
# indicator columns are dropped before returning.
add_access_and_contribution <- function(metadata, records) {
  # List-column in `records` -> indicator column created by the join. The
  # order fixes the column order of the intermediate table.
  flag_columns <- c(
    legislators_access_requested = "access_requested",
    legislators_access_requested_unsuccessfully =
      "access_requested_unsuccessfully",
    legislators_access_obtained_to_staff = "access_obtained_to_staff",
    legislators_access_obtained_to_member = "access_obtained_to_member",
    legislators_contributed = "contribution"
  )

  merged <- metadata
  for (list_col in names(flag_columns)) {
    # Each lookup has distinct keys, so the join never duplicates rows.
    merged <- left_join(
      merged,
      legislator_flag_table(records, list_col, flag_columns[[list_col]]),
      by = c("registrant.id", "year", "legislator.id")
    )
  }

  merged %>%
    # Rows without a match in a lookup get 0 instead of NA.
    mutate_at(vars(contribution, starts_with("access_")),
              function(x) replace(x, is.na(x), 0)) %>%
    # Access to the member takes precedence over access to staff.
    mutate(access = case_when(
      access_requested == 0 ~ "0_access_not_requested",
      access_requested == 1 &
        access_obtained_to_staff == 0 &
        access_obtained_to_member == 0 ~ "1_access_requested_unsuccessfully",
      access_requested == 1 &
        access_obtained_to_staff == 1 &
        access_obtained_to_member == 0 ~ "2_access_obtained_to_staff",
      access_requested == 1 &
        access_obtained_to_member == 1 ~ "3_access_obtained_to_member"
    )) %>%
    select(-starts_with("access_"))
}

# Apply the same merge to the full metadata and to the table with ideology.
legislator_registrant_year_data <- add_access_and_contribution(
  legislator_registrant_year_metadata,
  registrant_year_records
)

legislator_registrant_year_subset_with_ideology <- add_access_and_contribution(
  legislator_registrant_year_metadata_with_ideology,
  registrant_year_records
)